In [1]:
# import relevant modules
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions
In [2]:
# set to *.csv to process all
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)

#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')
In [3]:
# save to parquet
#suggestions_df.to_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# load the pre-processed suggestions (written once by the commented-out cell above,
# which concatenates the raw BTW17 CSV files for 2017-05-29 .. 2017-10-09)
suggestions_df = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
In [4]:
# Tokenize: split every suggestion on single spaces.  str() guards against
# non-string entries (e.g. NaN), which become their string repr before splitting.
suggestions_df['tokens'] = suggestions_df['suggestion'].apply(
    lambda value: str(value).split(' ')
)
suggestions_df.head(3)
Out[4]:
date queryterm ranking suggestion tokens
0 2017-05-29 05:00:01 doris ahnen 1.0 ministerin [ministerin]
1 2017-05-29 05:00:01 doris ahnen 2.0 http://de.wikipedia.org/wiki/Doris_Ahnen [http://de.wikipedia.org/wiki/Doris_Ahnen]
2 2017-05-29 05:00:01 doris ahnen 3.0 kinder [kinder]
In [5]:
# Vectorize the suggestions with the project helper from ../scripts/querysuggestion.py.
# NOTE(review): presumably returns a list of suggestion strings plus a row-aligned
# matrix of embedding vectors — confirm against vectorize_suggestions' definition.
suggestions, vector_data = vectorize_suggestions(suggestions_df)
In [6]:
# Retrieve unique suggestions and their vectors.
#
# The previous implementation filtered the suggestion list and the vector
# matrix in separate passes: falsy suggestions were removed from the list
# first, so every later index-based write into vector_data could hit the
# wrong row, leaving suggestions[i] and vector_data[i] misaligned.  It also
# used list.count() inside the loop, which is O(n^2).
#
# Here ONE boolean mask is built and applied to both structures, so they
# stay aligned by construction.  For duplicates the LAST occurrence is kept,
# which matches the behaviour of the original loop, but in O(n).
vector_data = np.asarray(vector_data, dtype=float)

keep = np.zeros(len(suggestions), dtype=bool)
seen = set()
# walk backwards so the last occurrence of each duplicate wins;
# falsy suggestions (empty string / None) are dropped as before
for i in range(len(suggestions) - 1, -1, -1):
    s = suggestions[i]
    if s and s not in seen:
        seen.add(s)
        keep[i] = True

# drop rows whose vector contains NaN — applied to BOTH structures,
# unlike the original, which only filtered the matrix
keep &= ~np.isnan(vector_data).any(axis=1)

suggestions = [s for s, k in zip(suggestions, keep) if k]
vector_data = vector_data[keep]
In [7]:
# Project the embedding vectors to 2-D with t-SNE.
# random_state is fixed so the layout is reproducible across runs.
X_tsne = TSNE(n_components=2, random_state=1410).fit_transform(vector_data)
In [8]:
# Enable live reloading so edits to ../scripts/clustering.py are picked up
# without restarting the kernel (the module was first imported in cell 1).
%reload_ext autoreload
%autoreload 2
from clustering import dbscan_suggestions
In [9]:
# Evaluate DBSCAN over a parameter sweep on the 2-D embedding and
# collect the per-configuration scores as a DataFrame
# (see dbscan_suggestions in ../scripts/clustering.py for the sweep itself).
dbscan_scores = pd.DataFrame(dbscan_suggestions(X_tsne))
In [10]:
# German column labels for the report figures
dbscan_scores.rename(columns={'eps': 'Maximale Distanz',
                              'min_samples': 'Minimale Anzahl Punkte pro Cluster',
                              'silhouette_score': 'Silhouette Score',
                              'num_cluster': 'Anzahl Cluster',
                              'num_noise': 'Anzahl Rauschpunkte'}, inplace=True)


def _score_matrix(value_col):
    """Pivot one score column into an eps x min_samples matrix (mean per cell)."""
    return pd.crosstab(index=dbscan_scores['Maximale Distanz'],
                       columns=dbscan_scores['Minimale Anzahl Punkte pro Cluster'],
                       values=dbscan_scores[value_col], aggfunc='mean')


fig = make_subplots(rows=1, cols=3,
                    subplot_titles=('Silhouette Score', 'Anzahl Rauschpunkte', 'Anzahl Cluster'),
                    shared_yaxes=True, horizontal_spacing=0.15)

# shared axes for all three heatmaps
x_vals = dbscan_scores['Minimale Anzahl Punkte pro Cluster'].unique()
y_vals = dbscan_scores['Maximale Distanz'].unique()

# (score column, trace name, colorscale, colorbar x-position, subplot column).
# Previously this was three copy-pasted add_trace stanzas; the trace name
# 'Silhoutte Scores' also carried a typo, fixed here.
heatmap_specs = [
    ('Silhouette Score', 'Silhouette Scores', px.colors.sequential.RdBu, 0.233, 1),
    ('Anzahl Rauschpunkte', 'Anzahl Rauschpunkte', px.colors.sequential.RdBu_r, 0.618, 2),
    ('Anzahl Cluster', 'Anzahl Cluster', px.colors.sequential.RdBu_r, 1, 3),
]
for value_col, trace_name, colorscale, cbar_x, col in heatmap_specs:
    fig.add_trace(go.Heatmap(z=_score_matrix(value_col), x=x_vals, y=y_vals,
                             colorscale=colorscale, colorbar_x=cbar_x, name=trace_name),
                  row=1, col=col)

fig.update_traces(hovertemplate='%{z}')
fig.update_annotations(font_size=18)

fig.update_yaxes(title='Epsilon', row=1, col=1)
for col in (1, 2, 3):
    fig.update_xaxes(title='Mindestanzahl Punkte pro Cluster', row=1, col=col)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [39]:
# Method 1: keep only configurations whose noise share stays below 30% of
# all points, then rank by number of clusters (fewest first).
noise_cap = len(X_tsne) * 0.3
low_noise = dbscan_scores['Anzahl Rauschpunkte'] < noise_cap
dbscan_scores.loc[low_noise].sort_values(by='Anzahl Cluster', ascending=True)
Out[39]:
Maximale Distanz Minimale Anzahl Punkte pro Cluster Silhouette Score Anzahl Cluster Anzahl Rauschpunkte
200 0.95 7 0.413889 766 6217
189 0.90 7 0.468986 812 6985
199 0.95 6 0.358504 856 4878
188 0.90 6 0.418926 915 5524
198 0.95 5 0.313208 957 3698
177 0.85 6 0.468428 970 6231
187 0.90 5 0.367601 1036 4176
166 0.80 6 0.510580 1037 6938
176 0.85 5 0.413622 1127 4729
165 0.80 5 0.467995 1217 5270
154 0.75 5 0.515935 1276 6017
143 0.70 5 0.551904 1337 6831
In [38]:
# Method 2: keep only configurations with fewer than 200 clusters,
# then rank by number of noise points (fewest first).
few_clusters = dbscan_scores['Anzahl Cluster'] < 200
dbscan_scores.loc[few_clusters].sort_values(by='Anzahl Rauschpunkte', ascending=True)
Out[38]:
Maximale Distanz Minimale Anzahl Punkte pro Cluster Silhouette Score Anzahl Cluster Anzahl Rauschpunkte
175 0.80 15 0.807362 194 19063
152 0.70 14 0.838935 182 19852
164 0.75 15 0.835114 167 19894
140 0.65 13 0.855534 192 19927
141 0.65 14 0.852699 162 20468
... ... ... ... ... ...
21 0.10 15 0.998876 4 23911
7 0.05 12 0.999134 4 23924
8 0.05 13 0.999384 3 23938
9 0.05 14 1.000000 1 23964
10 0.05 15 1.000000 0 23978

88 rows × 5 columns

In [43]:
from sklearn import cluster, metrics

# Final DBSCAN fit with the parameters selected above (eps=0.95, min_samples=7).
dbscan = cluster.DBSCAN(eps=0.95, min_samples=7).fit(X_tsne)
labels = dbscan.labels_

# Noise points carry label -1 and must be excluded from the silhouette
# computation; a boolean mask keeps labels and vectors aligned.
core_mask = labels != -1
labels_clean = labels[core_mask].tolist()
# cast to float64 to match the original list-round-trip dtype exactly
vectors_clean = np.asarray(X_tsne[core_mask], dtype=float)

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print(f'Silhouette Score w/o noise points: {metrics.silhouette_score(vectors_clean, labels_clean)}')
print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')
print(f'Noise in percent: {n_noise/len(labels)*100}%')
Silhouette Score w/o noise points: 0.41388864515738666
Estimated number of clusters: 766
Estimated number of noise points: 6217
Noise in percent: 25.927933939444493%
In [44]:
# Build the output DataFrame (one row per suggestion) and persist it.
output_df = pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
output_df['suggestion'] = suggestions
output_df['cluster'] = labels
# BUGFIX: attach the raw 2-D vectors BEFORE sorting.  The original assigned
# them after sort_values(..., ignore_index=True), so every row was paired
# with the vector of a different (pre-sort) row.
output_df['vector'] = list(X_tsne)
output_df.sort_values(by='cluster', inplace=True, ignore_index=True)
# cluster id as string so plotly treats it as a categorical color
output_df['cluster'] = output_df['cluster'].apply(str)

# save output df
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')
In [45]:
# Rename in place: the cluster-size cell below reads output_df['Cluster'],
# so the renamed columns must persist on the shared frame.
output_df.rename(columns={'cluster': 'Cluster', 'suggestion': 'Suggestion'},
                 inplace=True)

fig = px.scatter(output_df,
                 x='t-SNE(x)', y='t-SNE(y)',
                 color='Cluster', hover_name='Suggestion',
                 template='simple_white',
                 color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [46]:
# Distribution of cluster sizes; the noise "cluster" '-1' is excluded.
cluster_sizes = output_df['Cluster'].value_counts()
size_df = pd.DataFrame({'Cluster': cluster_sizes.index,
                        'Clustergröße': cluster_sizes.values})

fig = px.box(size_df[size_df['Cluster'] != '-1'],
             y='Clustergröße', points='all',
             template='simple_white',
             color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()